Week 6 Lab: Geostatistics

Author

Instructors for ENVH 556 Winter 2024

This document was rendered on November 07, 2024.

Setup

# Reset the session: remove every object from the workspace (including
# hidden ones) and detach/unload any non-base packages currently attached,
# so the lab starts from a clean state
rm(list = ls(all = TRUE))
if (!is.null(sessionInfo()$otherPkgs)) {
    suppressWarnings(
        lapply(sprintf("package:%s", names(sessionInfo()$otherPkgs)),
               detach, character.only = TRUE, unload = TRUE, force = TRUE)
    )
}

# Load 'pacman' for package management, installing it first if necessary.
# require() returns FALSE (rather than erroring) when the package is
# missing, which is why it is used here instead of library().
my_repo <- 'http://cran.r-project.org'
if (!require("pacman")) {
    install.packages("pacman", repos = my_repo)
}
Loading required package: pacman
# **SPH server**: rnaturalearthhires is not on CRAN, so install it from the
# rOpenSci R-universe repository as a source package (needed on the SPH server)
if (!require("rnaturalearthhires")) {
    install.packages("rnaturalearthhires", repos = "https://ropensci.r-universe.dev", type = "source")
}
Loading required package: rnaturalearthhires
# Install (if needed) and load all remaining packages in one call
pacman::p_load(
    tidyverse,                 # Data manipulation and visualization
    # NOTE: tidyverse takes a while to install on the SPH server
    ggspatial,                 # Geospatial extensions for ggplot
    # maptiles and tmap can be used instead of, or combined with, ggplot +
    # ggspatial: maptiles offers more tile-based map flexibility; ggspatial
    # makes it easy to annotate maps; tmap offers both static and interactive
    # maps that we won't review in this course.
    maptiles,
    terra,                     # Alternative mapping with raster files

    # May be needed on the SPH server
    prettymapr,

    rnaturalearth,             # Land features for map layers (remove water locations)
    rnaturalearthhires,        # High-resolution land features
    sf,                        # Handling spatial objects (modern replacement for 'sp')
    knitr,                     # Formatting tables with kable()
    gstat,                     # Geostatistical methods (e.g., kriging)
    Hmisc,                     # Data description functions like describe()
    scales,                    # Color scale customization for ggplot
    akima,                     # Bivariate interpolation for irregular data
    downloader                 # Downloading files over HTTP/HTTPS
)


# **Mac Users**: If you encounter issues with 'rgdal' on macOS Catalina or newer,
# you may need to install GDAL via terminal commands. Instructions are available [here](https://medium.com/@egiron/how-to-install-gdal-and-qgis-on-macos-catalina-ca690dca4f91).


# Create a "Datasets" directory if one does not already exist
# (showWarnings = FALSE silences the warning when it already exists)
dir.create(file.path("Datasets"), showWarnings=FALSE, recursive = TRUE)

# Directory where the input data files are stored/downloaded
data_path <- file.path("Datasets")

# File names and full paths for the two input datasets:
# the MESA Air snapshot monitoring data and the LA prediction grid
snapshot.file <- "snapshot_3_5_19.csv"
snapshot.path <- file.path(data_path, snapshot.file)
grid.file <- "la_grid_3_5_19.csv"
grid.path <- file.path(data_path, grid.file)

# Download a course dataset unless it is already present locally.
#
# file_name: basename of the file on the course website
# dest_path: local destination path for the file
# base_url:  course data directory URL (default: ENVH 556 site)
#
# Returns dest_path invisibly; downloads only when the file is absent.
download_if_missing <- function(file_name, dest_path,
                                base_url = "https://faculty.washington.edu/sheppard/envh556/Datasets") {
    if (!file.exists(dest_path)) {
        download.file(url = paste(base_url, file_name, sep = '/'),
                      destfile = dest_path)
    }
    invisible(dest_path)
}

# Fetch each dataset if it is not already present
download_if_missing(snapshot.file, snapshot.path)
download_if_missing(grid.file, grid.path)

# Output a warning message if the file cannot be found
if (file.exists(snapshot.path)) {
    snapshot <- read_csv(file = snapshot.path)
} else warning(paste("Can't find", snapshot.file, "!"))
Rows: 449 Columns: 81
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): seasonfac
dbl (80): latitude, longitude, ID, lat_m, long_m, nox, no2, no, ln_nox, ln_n...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the LA grid data when the file is present;
# otherwise emit a warning rather than stopping the render
if (!file.exists(grid.path)) {
    warning(paste("Can't find", grid.file, "!"))
} else {
    la_grid <- read_csv(file = grid.path)
}
Rows: 18718 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): native_id
dbl (11): latitude, longitude, lambert_x, lambert_y, D2A1, A1_50, A23_400, P...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove temporary setup variables. `url` is only created inside the
# download `if` blocks, so it may not exist; remove it conditionally to
# avoid spurious warnings. (`file_name` and `file_path` were never created
# by this script, so removing them only produced warnings.)
if (exists("url")) rm(url)
rm(data_path)
Warning in rm(url, file_name, file_path, data_path): object 'url' not found
Warning in rm(url, file_name, file_path, data_path): object 'file_name' not
found
Warning in rm(url, file_name, file_path, data_path): object 'file_path' not
found

Introduction, Purpose, and Acknowledgments

The purpose of this lab is to learn about geostatistical models and to further solidify your understanding of regression for predictive modeling. We will use the MESA Air snapshot data, as described in Mercer et al. (2011). Through this lab, you will gain experience with modern tools for **the use of geographic (spatial) data in R, creating maps, and adding your data to them**.

Acknowledgments: This lab was developed with significant contributions from Brian High, Chris Zuidema, and Dave Slager.

Note: While this lab has undergone extensive testing, it remains a work in progress. Minor errors may still be present, and some concepts could be expanded upon in future versions.

Getting Started

Resources for Spatial Data in R

The use of spatial data in R is rapidly evolving. Here are several useful resources for learning and staying updated:

  • Geocomputation with R by Robin Lovelace is a recently updated book offering an excellent introduction. Its introductory chapter provides an overview of the “what” and “why” of geocomputation. Additionally, it covers R’s spatial ecosystem and provides historical context for the tools we use in this lab in The history of R-spatial.

  • Spatial Data Analysis with R by Roger Bivand, a pioneer in spatial data tools for R. Published in 2013, the book’s datasets and scripts are available online. Chapter 1 offers an overview of foundational concepts, Chapter 4 provides a quick start for spatial data, and Chapter 8 introduces kriging. The full book is available through the UW library and on the course website.

  • Spatial Data Science by Pebesma & Bivand explains spatial data concepts in R, integrates with many modern packages (e.g., sf, lwgeom, and stars), and complements them with the tidyverse for efficient data handling.

Simple Features

The sf package represents the modern standard for working with spatial data in R, and what we will use in this lab.

As summarized in this vignette, simple features is a formal standard for geographic data and geographic information systems (GIS) that supports both geographic and non-geographic attributes. This standard provides a unified architecture with a geometry component to indicate each feature’s location on Earth. In R, the package sf represents simple features objects, and all sf package functions begin with st_ to denote spatial data type.

There are three classes within sf to represent simple features, but we will focus on the sf class, which operates like a data.frame with an sfc column containing the geometries for each record. Each sfc geometry is composed of sfg, the geometry for an individual feature such as a point or polygon.

Additional sf Package Resources:

Introduction to Coordinate Reference Systems (CRS)

A coordinate reference system (CRS) defines how spatial data is projected onto the earth’s surface. You may have seen latitude and longitude coordinates before, which are based on a global spherical system called WGS 84 (commonly used in GPS systems). However, different CRSs allow us to accurately measure distances and areas within specific regions by “flattening” the earth into a plane. Knowing the projection is essential for accurate distance calculations. Direct calculations using latitude and longitude coordinates are not accurate on a spherical surface, so we cannot use them directly for distances.

For instance, a CRS in meters makes it easy to work with distances in practical units. UTM, for example, is ideal for local scales because it divides the world into narrow zones (6 degrees of longitude wide), each optimized to minimize distortion within that specific area. This makes it highly accurate for measurements within a single zone, making it a popular choice for city and local-scale analyses. Lambert Conformal Conic can be used for larger regional areas (e.g., across the US), as it preserves shapes well over broad, east-west-oriented regions and minimizes distortion over large areas, which makes it suitable for national or regional mapping across multiple UTM zones.

When you create a spatial dataset using the sf package, you’ll specify a CRS to ensure that your data aligns correctly with any maps or layers you add. This is essential because it allows ggmap to plot your data in the correct location and ensures accurate distance measurements. Without a proper projection, spatial calculations such as distances and areas may be distorted.

CRSs can also be referred to by standardized codes. The EPSG code is a common way to refer to CRSs by number, such as EPSG:4326 for WGS 84 or EPSG:32610 for UTM Zone 10N. Another standardized format is the proj4 string, which describes projections in a text format (see https://epsg.io/28992).

The Mercer dataset includes three location types:

  • latitude and longitude in decimal degrees
  • lat_m and long_m, which may be in UTM
  • lambert_x and lambert_y, in meters using the USA_Contiguous_Lambert_Conformal_Conic projection

The sf package simplifies distance calculations by allowing projection settings directly within R objects. Lambert coordinates are ideal for distance calculations as they are in meters, suitable for a flat-surface model. See this New Zealand government resource for Lambert projection details.

Additional Resources: - Overview of coordinate reference systems - Nick Eubank’s guide on Projections and Coordinate Reference Systems

R Packages for Geostatistics and Spatial Data

For kriging, the gstat package is newer and compatible with sf classes.

For additional guidance, Bivand’s book and other online resources provide excellent gstat examples for kriging.

Notes on Universal Kriging and Prediction

In kriging, **you cannot predict at locations with observations**. Predictions at the same locations used to estimate parameters will yield the observed values due to perfect self-correlation, as noted in this StackOverflow discussion. This property underscores the importance of cross-validation for reliable predictions. The gstat package also allows smoothing instead of kriging by specifying an “Err” variogram instead of a “Nug” nugget effect.

Brief Discussion of Variograms

Semivariance is defined as half the average of squared differences between all points separated by a given distance, \(h\). A variogram plots these squared differences as a function of distance, either showing all points as a “cloud” or using an averaged, “binned” version. Empirical variograms help reveal spatial structure by showing how semivariance changes with distance. Typically, semivariance increases with distance until it reaches a plateau, indicating spatial dependence.

In geostatistics, we model the structure seen in an empirical variogram with an assumed model, such as exponential, spherical, Gaussian, or Matern, to approximate spatial relationships. This overview offers a summary of semivariance and variograms.

Terminology can be confusing, with “variogram” and “semivariogram” often used interchangeably. This paper explains the terms and how semivariance relates to standard variance by showing that a variogram is a re-expression of variance as a function of distance between points.

Application

Overview

  1. Convert Data to Spatial Format: Transform the dataset into spatial (sf) format to enable spatial analyses.

  2. Estimate a Variogram: Calculate and fit a variogram to model spatial structure. The variogram parameterizes the spatial correlation (structured error) in our kriging model, describing how the variable of interest (Y) varies over space. For universal kriging (UK), this includes the effect of covariates in the mean model.

  3. Fit a Model and Perform Cross-Validation: Use cross-validation to evaluate the accuracy of the kriging model, testing the model’s predictive performance.

  4. Predict at New Locations: Apply the fitted kriging model to predict values at new locations.

  5. Map the Predictions: Visualize the kriging predictions on a map.

Summary & learn about the data

# ----- basics ----- 

# Preview the first six rows of the snapshot data
head(snapshot)
# A tibble: 6 × 81
  latitude longitude    ID   lat_m  long_m   nox   no2    no ln_nox ln_no2 ln_no
     <dbl>     <dbl> <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl>
1     33.9     -118.   115  3.76e6 -1.09e7 59.8  22.1  37.7    4.09   3.09 3.63 
2     33.9     -118.   119  3.76e6 -1.09e7 47.2  21.6  25.5    3.85   3.07 3.24 
3     33.9     -118.   118  3.76e6 -1.09e7 49.7  20.7  29.0    3.91   3.03 3.37 
4     33.9     -118.   110  3.76e6 -1.09e7  5.44  4.02  1.42   1.69   1.39 0.349
5     33.9     -118.   114  3.76e6 -1.09e7 52.4  24.8  27.6    3.96   3.21 3.32 
6     33.9     -118.   112  3.76e6 -1.09e7 48.8  24.9  23.9    3.89   3.21 3.18 
# ℹ 70 more variables: Pop_500 <dbl>, Pop_1000 <dbl>, Pop_1500 <dbl>,
#   Pop_2000 <dbl>, Pop_2500 <dbl>, Pop_3000 <dbl>, Pop_5000 <dbl>,
#   Pop_10000 <dbl>, Pop_15000 <dbl>, Int_50 <dbl>, Int_100 <dbl>,
#   Int_150 <dbl>, Int_300 <dbl>, Int_500 <dbl>, Int_750 <dbl>, Int_1000 <dbl>,
#   Int_1500 <dbl>, Int_2000 <dbl>, Int_3000 <dbl>, Open_50 <dbl>,
#   Open_100 <dbl>, Open_150 <dbl>, Open_300 <dbl>, Open_500 <dbl>,
#   Open_750 <dbl>, Open_1000 <dbl>, Open_1500 <dbl>, Open_2000 <dbl>, …
# glimpse() and str() are both useful for learning an object's structure.
# glimpse() (from `dplyr`) is particularly handy once this becomes a
# spatial (sf) dataset.
str(snapshot)
spc_tbl_ [449 × 81] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ latitude : num [1:449] 33.9 33.9 33.9 33.9 33.9 ...
 $ longitude: num [1:449] -118 -118 -118 -118 -118 ...
 $ ID       : num [1:449] 115 119 118 110 114 112 108 116 109 113 ...
 $ lat_m    : num [1:449] 3763473 3763473 3763506 3763506 3763506 ...
 $ long_m   : num [1:449] -10904174 -10904174 -10904072 -10904072 -10904072 ...
 $ nox      : num [1:449] 59.79 47.15 49.73 5.44 52.38 ...
 $ no2      : num [1:449] 22.07 21.61 20.71 4.02 24.81 ...
 $ no       : num [1:449] 37.73 25.54 29.02 1.42 27.56 ...
 $ ln_nox   : num [1:449] 4.09 3.85 3.91 1.69 3.96 ...
 $ ln_no2   : num [1:449] 3.09 3.07 3.03 1.39 3.21 ...
 $ ln_no    : num [1:449] 3.63 3.24 3.368 0.349 3.316 ...
 $ Pop_500  : num [1:449] 0.0231 0.0231 0.0282 0.0282 0.0282 ...
 $ Pop_1000 : num [1:449] 0.0865 0.0865 0.0966 0.0966 0.0966 ...
 $ Pop_1500 : num [1:449] 0.184 0.184 0.198 0.198 0.198 ...
 $ Pop_2000 : num [1:449] 0.3 0.3 0.318 0.318 0.318 ...
 $ Pop_2500 : num [1:449] 0.444 0.444 0.468 0.468 0.468 ...
 $ Pop_3000 : num [1:449] 0.625 0.625 0.651 0.651 0.651 ...
 $ Pop_5000 : num [1:449] 1.45 1.45 1.49 1.49 1.49 ...
 $ Pop_10000: num [1:449] 5.12 5.12 5.2 5.2 5.2 ...
 $ Pop_15000: num [1:449] 11.5 11.5 11.6 11.6 11.6 ...
 $ Int_50   : num [1:449] 0.0 0.0 2.3e-05 2.3e-05 2.3e-05 ...
 $ Int_100  : num [1:449] 0 0 0.000124 0.000124 0.000124 ...
 $ Int_150  : num [1:449] 3.05e-05 3.05e-05 3.04e-04 3.04e-04 3.04e-04 ...
 $ Int_300  : num [1:449] 0.000694 0.000694 0.001316 0.001316 0.001316 ...
 $ Int_500  : num [1:449] 0.00273 0.00273 0.0038 0.0038 0.0038 ...
 $ Int_750  : num [1:449] 0.00711 0.00711 0.00871 0.00871 0.00871 ...
 $ Int_1000 : num [1:449] 0.0135 0.0135 0.0157 0.0157 0.0157 ...
 $ Int_1500 : num [1:449] 0.0323 0.0323 0.0355 0.0355 0.0355 ...
 $ Int_2000 : num [1:449] 0.0604 0.0604 0.0647 0.0647 0.0647 ...
 $ Int_3000 : num [1:449] 0.139 0.139 0.145 0.145 0.145 ...
 $ Open_50  : num [1:449] 0.00734 0.00734 0.00553 0.00553 0.00553 ...
 $ Open_100 : num [1:449] 0.0232 0.0232 0.019 0.019 0.019 ...
 $ Open_150 : num [1:449] 0.0437 0.0437 0.04 0.04 0.04 ...
 $ Open_300 : num [1:449] 0.0949 0.0949 0.0934 0.0934 0.0934 ...
 $ Open_500 : num [1:449] 0.158 0.158 0.157 0.157 0.157 ...
 $ Open_750 : num [1:449] 0.232 0.232 0.231 0.231 0.231 ...
 $ Open_1000: num [1:449] 0.305 0.305 0.304 0.304 0.304 ...
 $ Open_1500: num [1:449] 0.453 0.453 0.451 0.451 0.451 ...
 $ Open_2000: num [1:449] 0.53 0.53 0.529 0.529 0.529 ...
 $ Open_3000: num [1:449] 0.67 0.67 0.669 0.669 0.669 ...
 $ D2C      : num [1:449] 0.00118 0.00118 0.00225 0.00225 0.00225 ...
 $ D2Comm   : num [1:449] 0.0537 0.0537 0.0431 0.0431 0.0431 ...
 $ D2Rroad  : num [1:449] 0.408 0.408 0.398 0.398 0.398 ...
 $ D2Ryard  : num [1:449] 1.51 1.51 1.5 1.5 1.5 ...
 $ D2airp   : num [1:449] 0.856 0.856 0.853 0.853 0.853 ...
 $ D2Mport  : num [1:449] 2.31 2.31 2.3 2.3 2.3 ...
 $ D2Lport  : num [1:449] 1.91 1.91 1.9 1.9 1.9 ...
 $ D2R      : num [1:449] 2.25 2.25 1.84 1.84 1.84 ...
 $ D2A1     : num [1:449] 3.67 3.67 3.66 3.66 3.66 ...
 $ D2A2     : num [1:449] 2.94 2.94 2.89 2.89 2.89 ...
 $ D2A3     : num [1:449] 2.25 2.25 1.84 1.84 1.84 ...
 $ A1_100   : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_150   : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_300   : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_400   : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_50    : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_500   : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_750   : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_1k    : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_3k    : num [1:449] 0 0 0 0 0 0 0 0 0 0 ...
 $ A1_5k    : num [1:449] 0.454 0.454 0.53 0.53 0.53 ...
 $ A1_10k   : num [1:449] 4.78 4.78 4.86 4.86 4.86 ...
 $ A1_15k   : num [1:449] 17.5 17.5 17.7 17.7 17.7 ...
 $ A23_50   : num [1:449] 0 0 0 0 0 ...
 $ A23_100  : num [1:449] 0 0 0.0264 0.0264 0.0264 ...
 $ A23_150  : num [1:449] 0 0 0.0539 0.0539 0.0539 ...
 $ A23_300  : num [1:449] 0.105 0.105 0.121 0.121 0.121 ...
 $ A23_400  : num [1:449] 0.155 0.155 0.185 0.185 0.185 ...
 $ A23_500  : num [1:449] 0.258 0.258 0.284 0.284 0.284 ...
 $ A23_750  : num [1:449] 0.574 0.574 0.659 0.659 0.659 ...
 $ A23_1k   : num [1:449] 1.05 1.05 1.25 1.25 1.25 ...
 $ A23_5k   : num [1:449] 13.8 13.8 14.1 14.1 14.1 ...
 $ A23_10k  : num [1:449] 46.1 46.1 46.5 46.5 46.5 ...
 $ A23_15k  : num [1:449] 98.2 98.2 98.8 98.8 98.8 ...
 $ FieldID  : num [1:449] 151 127 126 147 150 148 145 124 146 149 ...
 $ cluster  : num [1:449] 2 2 2 2 2 2 2 2 2 2 ...
 $ group_loc: num [1:449] 26 33 33 27 26 26 27 33 27 26 ...
 $ season   : num [1:449] 2 3 3 1 2 2 1 3 1 2 ...
 $ seasonfac: chr [1:449] "2Fall" "3Winter" "3Winter" "1Summer" ...
 $ lambert_x: num [1:449] -2049080 -2049080 -2048974 -2048974 -2048974 ...
 $ lambert_y: num [1:449] -313567 -313567 -313560 -313560 -313560 ...
 - attr(*, "spec")=
  .. cols(
  ..   latitude = col_double(),
  ..   longitude = col_double(),
  ..   ID = col_double(),
  ..   lat_m = col_double(),
  ..   long_m = col_double(),
  ..   nox = col_double(),
  ..   no2 = col_double(),
  ..   no = col_double(),
  ..   ln_nox = col_double(),
  ..   ln_no2 = col_double(),
  ..   ln_no = col_double(),
  ..   Pop_500 = col_double(),
  ..   Pop_1000 = col_double(),
  ..   Pop_1500 = col_double(),
  ..   Pop_2000 = col_double(),
  ..   Pop_2500 = col_double(),
  ..   Pop_3000 = col_double(),
  ..   Pop_5000 = col_double(),
  ..   Pop_10000 = col_double(),
  ..   Pop_15000 = col_double(),
  ..   Int_50 = col_double(),
  ..   Int_100 = col_double(),
  ..   Int_150 = col_double(),
  ..   Int_300 = col_double(),
  ..   Int_500 = col_double(),
  ..   Int_750 = col_double(),
  ..   Int_1000 = col_double(),
  ..   Int_1500 = col_double(),
  ..   Int_2000 = col_double(),
  ..   Int_3000 = col_double(),
  ..   Open_50 = col_double(),
  ..   Open_100 = col_double(),
  ..   Open_150 = col_double(),
  ..   Open_300 = col_double(),
  ..   Open_500 = col_double(),
  ..   Open_750 = col_double(),
  ..   Open_1000 = col_double(),
  ..   Open_1500 = col_double(),
  ..   Open_2000 = col_double(),
  ..   Open_3000 = col_double(),
  ..   D2C = col_double(),
  ..   D2Comm = col_double(),
  ..   D2Rroad = col_double(),
  ..   D2Ryard = col_double(),
  ..   D2airp = col_double(),
  ..   D2Mport = col_double(),
  ..   D2Lport = col_double(),
  ..   D2R = col_double(),
  ..   D2A1 = col_double(),
  ..   D2A2 = col_double(),
  ..   D2A3 = col_double(),
  ..   A1_100 = col_double(),
  ..   A1_150 = col_double(),
  ..   A1_300 = col_double(),
  ..   A1_400 = col_double(),
  ..   A1_50 = col_double(),
  ..   A1_500 = col_double(),
  ..   A1_750 = col_double(),
  ..   A1_1k = col_double(),
  ..   A1_3k = col_double(),
  ..   A1_5k = col_double(),
  ..   A1_10k = col_double(),
  ..   A1_15k = col_double(),
  ..   A23_50 = col_double(),
  ..   A23_100 = col_double(),
  ..   A23_150 = col_double(),
  ..   A23_300 = col_double(),
  ..   A23_400 = col_double(),
  ..   A23_500 = col_double(),
  ..   A23_750 = col_double(),
  ..   A23_1k = col_double(),
  ..   A23_5k = col_double(),
  ..   A23_10k = col_double(),
  ..   A23_15k = col_double(),
  ..   FieldID = col_double(),
  ..   cluster = col_double(),
  ..   group_loc = col_double(),
  ..   season = col_double(),
  ..   seasonfac = col_character(),
  ..   lambert_x = col_double(),
  ..   lambert_y = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 
# Transposed preview: every column with its type and first few values
glimpse(snapshot)
Rows: 449
Columns: 81
$ latitude  <dbl> 33.8655, 33.8655, 33.8658, 33.8658, 33.8658, 33.8661, 33.866…
$ longitude <dbl> -118.4030, -118.4030, -118.4019, -118.4019, -118.4019, -118.…
$ ID        <dbl> 115, 119, 118, 110, 114, 112, 108, 116, 109, 113, 117, 107, …
$ lat_m     <dbl> 3763473, 3763473, 3763506, 3763506, 3763506, 3763540, 376354…
$ long_m    <dbl> -10904174, -10904174, -10904072, -10904072, -10904072, -1090…
$ nox       <dbl> 59.792795, 47.151740, 49.734201, 5.443071, 52.377735, 48.845…
$ no2       <dbl> 22.066961, 21.608660, 20.710006, 4.024763, 24.814801, 24.897…
$ no        <dbl> 37.725834, 25.543080, 29.024194, 1.418308, 27.562934, 23.948…
$ ln_nox    <dbl> 4.090885, 3.853371, 3.906693, 1.694343, 3.958482, 3.888660, …
$ ln_no2    <dbl> 3.094082, 3.073094, 3.030617, 1.392466, 3.211440, 3.214760, …
$ ln_no     <dbl> 3.6303451, 3.2403665, 3.3681297, 0.3494648, 3.3164718, 3.175…
$ Pop_500   <dbl> 0.02305582, 0.02305582, 0.02816441, 0.02816441, 0.02816441, …
$ Pop_1000  <dbl> 0.08647152, 0.08647152, 0.09656532, 0.09656532, 0.09656532, …
$ Pop_1500  <dbl> 0.1844404, 0.1844404, 0.1979028, 0.1979028, 0.1979028, 0.219…
$ Pop_2000  <dbl> 0.3003615, 0.3003615, 0.3181143, 0.3181143, 0.3181143, 0.346…
$ Pop_2500  <dbl> 0.4442103, 0.4442103, 0.4677484, 0.4677484, 0.4677484, 0.505…
$ Pop_3000  <dbl> 0.6247332, 0.6247332, 0.6511513, 0.6511513, 0.6511513, 0.690…
$ Pop_5000  <dbl> 1.450194, 1.450194, 1.485830, 1.485830, 1.485830, 1.543852, …
$ Pop_10000 <dbl> 5.124360, 5.124360, 5.203453, 5.203453, 5.203453, 5.328547, …
$ Pop_15000 <dbl> 11.49131, 11.49131, 11.60518, 11.60518, 11.60518, 11.77500, …
$ Int_50    <dbl> 0.000000e+00, 0.000000e+00, 2.300420e-05, 2.300420e-05, 2.30…
$ Int_100   <dbl> 0.0000000000, 0.0000000000, 0.0001240698, 0.0001240698, 0.00…
$ Int_150   <dbl> 3.049229e-05, 3.049229e-05, 3.037784e-04, 3.037784e-04, 3.03…
$ Int_300   <dbl> 0.0006943842, 0.0006943842, 0.0013164949, 0.0013164949, 0.00…
$ Int_500   <dbl> 0.002733914, 0.002733914, 0.003795421, 0.003795421, 0.003795…
$ Int_750   <dbl> 0.007112461, 0.007112461, 0.008714295, 0.008714295, 0.008714…
$ Int_1000  <dbl> 0.01351628, 0.01351628, 0.01565563, 0.01565563, 0.01565563, …
$ Int_1500  <dbl> 0.03227645, 0.03227645, 0.03548815, 0.03548815, 0.03548815, …
$ Int_2000  <dbl> 0.06044282, 0.06044282, 0.06471332, 0.06471332, 0.06471332, …
$ Int_3000  <dbl> 0.1389744, 0.1389744, 0.1453382, 0.1453382, 0.1453382, 0.155…
$ Open_50   <dbl> 0.007337417, 0.007337417, 0.005532907, 0.005532907, 0.005532…
$ Open_100  <dbl> 0.02319101, 0.02319101, 0.01896796, 0.01896796, 0.01896796, …
$ Open_150  <dbl> 0.043685260, 0.043685260, 0.040038149, 0.040038149, 0.040038…
$ Open_300  <dbl> 0.094913329, 0.094913329, 0.093430782, 0.093430782, 0.093430…
$ Open_500  <dbl> 0.1576800, 0.1576800, 0.1565374, 0.1565374, 0.1565374, 0.139…
$ Open_750  <dbl> 0.2318343, 0.2318343, 0.2305649, 0.2305649, 0.2305649, 0.219…
$ Open_1000 <dbl> 0.3051261, 0.3051261, 0.3039934, 0.3039934, 0.3039934, 0.295…
$ Open_1500 <dbl> 0.4525185, 0.4525185, 0.4510480, 0.4510480, 0.4510480, 0.444…
$ Open_2000 <dbl> 0.5295536, 0.5295536, 0.5287640, 0.5287640, 0.5287640, 0.523…
$ Open_3000 <dbl> 0.6701055, 0.6701055, 0.6689487, 0.6689487, 0.6689487, 0.663…
$ D2C       <dbl> 0.00117531, 0.00117531, 0.00224596, 0.00224596, 0.00224596, …
$ D2Comm    <dbl> 0.0537268, 0.0537268, 0.0430847, 0.0430847, 0.0430847, 0.026…
$ D2Rroad   <dbl> 0.4084856, 0.4084856, 0.3982672, 0.3982672, 0.3982672, 0.381…
$ D2Ryard   <dbl> 1.505729, 1.505729, 1.500410, 1.500410, 1.500410, 1.490392, …
$ D2airp    <dbl> 0.8558056, 0.8558056, 0.8531033, 0.8531033, 0.8531033, 0.851…
$ D2Mport   <dbl> 2.310610, 2.310610, 2.303232, 2.303232, 2.303232, 2.290198, …
$ D2Lport   <dbl> 1.910131, 1.910131, 1.904829, 1.904829, 1.904829, 1.894813, …
$ D2R       <dbl> 2.247499, 2.247499, 1.844452, 1.844452, 1.844452, 1.926913, …
$ D2A1      <dbl> 3.667007, 3.667007, 3.658159, 3.658159, 3.658159, 3.644959, …
$ D2A2      <dbl> 2.944537, 2.944537, 2.888423, 2.888423, 2.888423, 2.780802, …
$ D2A3      <dbl> 2.247499, 2.247499, 1.844452, 1.844452, 1.844452, 1.926913, …
$ A1_100    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, …
$ A1_150    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, …
$ A1_300    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000…
$ A1_400    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, …
$ A1_50     <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000000000, 0.000000…
$ A1_500    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000…
$ A1_750    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000…
$ A1_1k     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000…
$ A1_3k     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ A1_5k     <dbl> 0.4538663, 0.4538663, 0.5296068, 0.5296068, 0.5296068, 0.636…
$ A1_10k    <dbl> 4.775270, 4.775270, 4.860473, 4.860473, 4.860473, 4.989863, …
$ A1_15k    <dbl> 17.52115, 17.52115, 17.69218, 17.69218, 17.69218, 17.94655, …
$ A23_50    <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000000000, 0.000000…
$ A23_100   <dbl> 0.00000000, 0.00000000, 0.02640600, 0.02640600, 0.02640600, …
$ A23_150   <dbl> 0.00000000, 0.00000000, 0.05390069, 0.05390069, 0.05390069, …
$ A23_300   <dbl> 0.10518606, 0.10518606, 0.12071717, 0.12071717, 0.12071717, …
$ A23_400   <dbl> 0.15459495, 0.15459495, 0.18546583, 0.18546583, 0.18546583, …
$ A23_500   <dbl> 0.2576134, 0.2576134, 0.2837931, 0.2837931, 0.2837931, 0.436…
$ A23_750   <dbl> 0.5742651, 0.5742651, 0.6591905, 0.6591905, 0.6591905, 0.805…
$ A23_1k    <dbl> 1.050208, 1.050208, 1.248160, 1.248160, 1.248160, 1.472237, …
$ A23_5k    <dbl> 13.78960, 13.78960, 14.06955, 14.06955, 14.06955, 14.49149, …
$ A23_10k   <dbl> 46.12615, 46.12615, 46.49810, 46.49810, 46.49810, 47.06752, …
$ A23_15k   <dbl> 98.22387, 98.22387, 98.82735, 98.82735, 98.82735, 99.99205, …
$ FieldID   <dbl> 151, 127, 126, 147, 150, 148, 145, 124, 146, 149, 125, 144, …
$ cluster   <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, …
$ group_loc <dbl> 26, 33, 33, 27, 26, 26, 27, 33, 27, 26, 33, 27, 33, 26, 34, …
$ season    <dbl> 2, 3, 3, 1, 2, 2, 1, 3, 1, 2, 3, 1, 3, 2, 3, 2, 1, 2, 3, 1, …
$ seasonfac <chr> "2Fall", "3Winter", "3Winter", "1Summer", "2Fall", "2Fall", …
$ lambert_x <dbl> -2049080, -2049080, -2048974, -2048974, -2048974, -2048804, …
$ lambert_y <dbl> -313566.9, -313566.9, -313559.5, -313559.5, -313559.5, -3135…
# Numeric summary (min, quartiles, mean, max) of every column
summary(snapshot) 
    latitude       longitude            ID             lat_m        
 Min.   :33.87   Min.   :-118.5   Min.   :  1.00   Min.   :3763473  
 1st Qu.:33.97   1st Qu.:-118.3   1st Qu.: 38.00   1st Qu.:3774597  
 Median :34.03   Median :-118.3   Median : 75.00   Median :3782232  
 Mean   :34.03   Mean   :-118.2   Mean   : 75.34   Mean   :3781544  
 3rd Qu.:34.09   3rd Qu.:-118.2   3rd Qu.:113.00   3rd Qu.:3788433  
 Max.   :34.20   Max.   :-117.8   Max.   :152.00   Max.   :3801091  
     long_m               nox               no2                no          
 Min.   :-10909524   Min.   :  5.443   Min.   : 0.8232   Min.   :  0.6999  
 1st Qu.:-10897745   1st Qu.: 39.577   1st Qu.:25.2658   1st Qu.: 11.9666  
 Median :-10890350   Median : 63.429   Median :30.6104   Median : 35.0662  
 Mean   :-10887982   Mean   : 68.327   Mean   :29.4747   Mean   : 38.8519  
 3rd Qu.:-10881841   3rd Qu.: 92.011   3rd Qu.:34.3679   3rd Qu.: 59.3239  
 Max.   :-10847914   Max.   :164.199   Max.   :47.0383   Max.   :122.2598  
     ln_nox          ln_no2            ln_no            Pop_500        
 Min.   :1.694   Min.   :-0.1945   Min.   :-0.3568   Min.   :0.001975  
 1st Qu.:3.678   1st Qu.: 3.2295   1st Qu.: 2.4821   1st Qu.:0.018508  
 Median :4.150   Median : 3.4213   Median : 3.5572   Median :0.031940  
 Mean   :4.087   Mean   : 3.3371   Mean   : 3.3125   Mean   :0.038090  
 3rd Qu.:4.522   3rd Qu.: 3.5371   3rd Qu.: 4.0830   3rd Qu.:0.056955  
 Max.   :5.101   Max.   : 3.8510   Max.   : 4.8061   Max.   :0.080327  
    Pop_1000          Pop_1500         Pop_2000         Pop_2500     
 Min.   :0.02917   Min.   :0.1011   Min.   :0.1851   Min.   :0.2737  
 1st Qu.:0.08311   1st Qu.:0.1966   1st Qu.:0.3461   1st Qu.:0.5513  
 Median :0.11613   Median :0.2734   Median :0.4913   Median :0.7935  
 Mean   :0.14391   Mean   :0.3178   Mean   :0.5555   Mean   :0.8459  
 3rd Qu.:0.20179   3rd Qu.:0.4153   3rd Qu.:0.7024   3rd Qu.:1.0255  
 Max.   :0.33947   Max.   :0.8120   Max.   :1.3647   Max.   :2.1545  
    Pop_3000         Pop_5000        Pop_10000        Pop_15000     
 Min.   :0.3819   Min.   :0.8141   Min.   : 4.864   Min.   : 9.673  
 1st Qu.:0.8068   1st Qu.:2.0973   1st Qu.: 7.264   1st Qu.:15.247  
 Median :1.0928   Median :2.8316   Median :11.026   Median :24.309  
 Mean   :1.1878   Mean   :3.0729   Mean   :10.657   Mean   :22.232  
 3rd Qu.:1.4412   3rd Qu.:4.0804   3rd Qu.:13.961   3rd Qu.:28.052  
 Max.   :2.8790   Max.   :6.3373   Max.   :18.116   Max.   :33.686  
     Int_50             Int_100             Int_150             Int_300        
 Min.   :0.000e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:7.833e-05   1st Qu.:0.0003137   1st Qu.:0.0007062   1st Qu.:0.002826  
 Median :7.833e-05   Median :0.0003137   Median :0.0007062   Median :0.002826  
 Mean   :7.587e-05   Mean   :0.0003033   Mean   :0.0006821   Mean   :0.002714  
 3rd Qu.:7.833e-05   3rd Qu.:0.0003137   3rd Qu.:0.0007062   3rd Qu.:0.002826  
 Max.   :7.833e-05   Max.   :0.0003140   Max.   :0.0007066   Max.   :0.002827  
    Int_500             Int_750            Int_1000          Int_1500      
 Min.   :0.0004047   Min.   :0.003582   Min.   :0.00873   Min.   :0.02304  
 1st Qu.:0.0078394   1st Qu.:0.016758   1st Qu.:0.02940   1st Qu.:0.06324  
 Median :0.0078519   Median :0.017667   Median :0.03095   Median :0.06774  
 Mean   :0.0074804   Mean   :0.016625   Mean   :0.02917   Mean   :0.06440  
 3rd Qu.:0.0078519   3rd Qu.:0.017668   3rd Qu.:0.03141   3rd Qu.:0.06947  
 Max.   :0.0078532   Max.   :0.017670   Max.   :0.03141   Max.   :0.07068  
    Int_2000         Int_3000          Open_50             Open_100        
 Min.   :0.0426   Min.   :0.09684   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.1131   1st Qu.:0.24446   1st Qu.:0.0000000   1st Qu.:0.0000000  
 Median :0.1197   Median :0.25963   Median :0.0000000   Median :0.0000000  
 Mean   :0.1129   Mean   :0.24632   Mean   :0.0001892   Mean   :0.0007449  
 3rd Qu.:0.1224   3rd Qu.:0.26807   3rd Qu.:0.0000000   3rd Qu.:0.0000000  
 Max.   :0.1257   Max.   :0.28166   Max.   :0.0078333   Max.   :0.0313749  
    Open_150           Open_300           Open_500          Open_750      
 Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.000000   Median :0.000000   Median :0.00000   Median :0.00000  
 Mean   :0.001718   Mean   :0.007023   Mean   :0.01873   Mean   :0.04351  
 3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :0.070624   Max.   :0.282618   Max.   :0.74472   Max.   :1.40862  
   Open_1000         Open_1500         Open_2000        Open_3000       
 Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   : 0.00000  
 1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: 0.00000  
 Median :0.00000   Median :0.00000   Median :0.0000   Median : 0.06987  
 Mean   :0.08641   Mean   :0.26005   Mean   :0.5343   Mean   : 1.61086  
 3rd Qu.:0.00000   3rd Qu.:0.07615   3rd Qu.:0.2485   3rd Qu.: 1.62485  
 Max.   :2.25284   Max.   :4.64466   Max.   :7.8925   Max.   :16.57777  
      D2C               D2Comm            D2Rroad            D2Ryard       
 Min.   :0.001175   Min.   :0.000000   Min.   :0.001648   Min.   :0.01708  
 1st Qu.:0.118063   1st Qu.:0.009005   1st Qu.:0.060380   1st Qu.:0.38064  
 Median :0.150000   Median :0.022972   Median :0.124037   Median :0.77699  
 Mean   :0.124338   Mean   :0.026978   Mean   :0.189561   Mean   :0.78278  
 3rd Qu.:0.150000   3rd Qu.:0.041274   3rd Qu.:0.310072   3rd Qu.:1.04310  
 Max.   :0.150000   Max.   :0.086601   Max.   :0.710285   Max.   :1.86656  
     D2airp           D2Mport         D2Lport           D2R       
 Min.   :0.06895   Min.   :1.364   Min.   :1.705   Min.   :1.000  
 1st Qu.:0.47971   1st Qu.:2.311   1st Qu.:2.393   1st Qu.:1.700  
 Median :0.84930   Median :2.500   Median :2.500   Median :2.011  
 Mean   :0.84860   Mean   :2.390   Mean   :2.392   Mean   :1.977  
 3rd Qu.:1.19120   3rd Qu.:2.500   3rd Qu.:2.500   3rd Qu.:2.343  
 Max.   :1.63273   Max.   :2.500   Max.   :2.500   Max.   :2.695  
      D2A1            D2A2            D2A3           A1_100        
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.000000  
 1st Qu.:2.498   1st Qu.:2.797   1st Qu.:1.826   1st Qu.:0.000000  
 Median :3.003   Median :3.471   Median :2.168   Median :0.000000  
 Mean   :2.882   Mean   :3.302   Mean   :2.130   Mean   :0.003396  
 3rd Qu.:3.358   3rd Qu.:3.788   3rd Qu.:2.517   3rd Qu.:0.000000  
 Max.   :3.784   Max.   :4.110   Max.   :2.834   Max.   :0.069835  
     A1_150             A1_300            A1_400            A1_50          
 Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000000  
 Median :0.000000   Median :0.00000   Median :0.00000   Median :0.0000000  
 Mean   :0.007723   Mean   :0.02705   Mean   :0.04686   Mean   :0.0006208  
 3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.09734   3rd Qu.:0.0000000  
 Max.   :0.113516   Max.   :0.23686   Max.   :0.31762   Max.   :0.0188561  
     A1_500            A1_750           A1_1k            A1_3k      
 Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
 1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.159  
 Median :0.00000   Median :0.0000   Median :0.0000   Median :1.500  
 Mean   :0.06553   Mean   :0.1162   Mean   :0.1927   Mean   :1.433  
 3rd Qu.:0.15390   3rd Qu.:0.2783   3rd Qu.:0.3835   3rd Qu.:2.012  
 Max.   :0.39807   Max.   :0.5988   Max.   :0.7991   Max.   :4.671  
     A1_5k            A1_10k           A1_15k          A23_50        
 Min.   : 0.000   Min.   : 4.775   Min.   :12.61   Min.   :0.000000  
 1st Qu.: 2.783   1st Qu.:13.305   1st Qu.:25.48   1st Qu.:0.000000  
 Median : 4.275   Median :16.398   Median :35.52   Median :0.000000  
 Mean   : 4.279   Mean   :17.029   Mean   :34.63   Mean   :0.001774  
 3rd Qu.: 5.802   3rd Qu.:21.757   3rd Qu.:43.84   3rd Qu.:0.000000  
 Max.   :11.023   Max.   :28.080   Max.   :52.26   Max.   :0.019433  
    A23_100            A23_150           A23_300            A23_400       
 Min.   :0.000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
 1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.006448   1st Qu.:0.07667  
 Median :0.000000   Median :0.01947   Median :0.058893   Median :0.10349  
 Mean   :0.008599   Mean   :0.02019   Mean   :0.061969   Mean   :0.11831  
 3rd Qu.:0.017309   3rd Qu.:0.02991   3rd Qu.:0.109842   3rd Qu.:0.15803  
 Max.   :0.039698   Max.   :0.07825   Max.   :0.197963   Max.   :0.35228  
    A23_500          A23_750           A23_1k           A23_5k      
 Min.   :0.0000   Min.   :0.1499   Min.   :0.2244   Min.   : 7.149  
 1st Qu.:0.1090   1st Qu.:0.3112   1st Qu.:0.5599   1st Qu.:14.491  
 Median :0.1842   Median :0.4357   Median :0.7468   Median :16.138  
 Mean   :0.1902   Mean   :0.4284   Mean   :0.7503   Mean   :16.132  
 3rd Qu.:0.2488   3rd Qu.:0.5095   3rd Qu.:0.8818   3rd Qu.:17.615  
 Max.   :0.5220   Max.   :1.0443   Max.   :1.5796   Max.   :22.278  
    A23_10k         A23_15k          FieldID          cluster      
 Min.   :32.59   Min.   : 78.11   Min.   :  1.00   Min.   : 1.000  
 1st Qu.:53.45   1st Qu.:104.33   1st Qu.: 38.00   1st Qu.: 3.000  
 Median :65.36   Median :136.35   Median : 75.00   Median : 5.000  
 Mean   :61.88   Mean   :130.67   Mean   : 75.34   Mean   : 5.116  
 3rd Qu.:70.94   3rd Qu.:152.88   3rd Qu.:113.00   3rd Qu.: 8.000  
 Max.   :76.55   Max.   :170.59   Max.   :152.00   Max.   :10.000  
   group_loc         season       seasonfac           lambert_x       
 Min.   : 1.00   Min.   :1.000   Length:449         Min.   :-2049958  
 1st Qu.: 7.00   1st Qu.:1.000   Class :character   1st Qu.:-2040532  
 Median :17.00   Median :2.000   Mode  :character   Median :-2031533  
 Mean   :17.22   Mean   :2.002                      Mean   :-2028963  
 3rd Qu.:27.00   3rd Qu.:3.000                      3rd Qu.:-2021439  
 Max.   :43.00   Max.   :3.000                      Max.   :-1989855  
   lambert_y      
 Min.   :-316187  
 1st Qu.:-308020  
 Median :-301658  
 Mean   :-300031  
 3rd Qu.:-294015  
 Max.   :-281058  

Transform Data to Spatial (sf) Objects

Read the snapshot data as an sf object. We’ll define the coordinate systems for later use and focus on fall season data. Summarize the dataset and observe the coordinate range and dataset features (e.g., included covariates).

#---- Read fall snapshot as an sf object -----

# Define coordinate systems using EPSG codes. Here we use numbers and strings as an example.  
# WGS84 latitude-longitude. same as: "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
latlong_proj <- 4326  
# projection in meters we need for distance calculations
# (Lambert conformal conic over the continental US, NAD83 datum, units in meters)
lambert_proj <- "+proj=lcc +lat_1=33 +lat_2=45 +lat_0=39 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs" 


# Filter for fall season and select relevant covariates
# (ln_nox is the outcome; the remaining columns are GIS covariates plus
#  cluster/group/field identifiers used later for cross-validation)
fall <- snapshot %>%
    filter(seasonfac == "2Fall") %>%
    select(ID, latitude, longitude, ln_nox, D2A1, A1_50, A23_400, Pop_5000, D2C, Int_3000, D2Comm, cluster, group_loc, FieldID) %>%
    as.data.frame()

# Convert to sf object, specifying coordinate reference system (CRS)
# Note: st_as_sf() consumes the longitude/latitude columns into a geometry column
fall <- st_as_sf(fall, coords = c('longitude', 'latitude'), crs = latlong_proj)

# Summarize and view the structure
fall
Simple feature collection with 152 features and 12 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -118.4611 ymin: 33.8655 xmax: -117.7921 ymax: 34.204
Geodetic CRS:  WGS 84
First 10 features:
    ID   ln_nox     D2A1       A1_50    A23_400 Pop_5000        D2C  Int_3000
1  115 4.090885 3.667007 0.000000000 0.15459495 1.450194 0.00117531 0.1389744
2  114 3.958482 3.658159 0.000000000 0.18546583 1.485830 0.00224596 0.1453382
3  112 3.888660 3.644959 0.000000000 0.25308072 1.543852 0.00392796 0.1550307
4  113 3.928215 3.647981 0.000000000 0.21802194 1.527761 0.00349128 0.1525681
5  111 3.972750 3.635065 0.000000000 0.35227517 1.592584 0.00526450 0.1626672
6  121 4.362086 2.537895 0.000000000 0.15552800 2.906038 0.05123766 0.2654660
7  120 4.381691 1.878206 0.000000000 0.10671729 2.907382 0.05373047 0.2600416
8  119 4.394589 1.506329 0.007704811 0.08137483 2.904599 0.05420517 0.2592529
9  118 4.563537 1.707085 0.000000000 0.02626846 2.904547 0.05519852 0.2573964
10 117 4.437409 1.856009 0.000000000 0.03109334 2.906526 0.05535398 0.2570769
      D2Comm cluster group_loc FieldID                  geometry
1  0.0537268       2        26     151  POINT (-118.403 33.8655)
2  0.0430847       2        26     150 POINT (-118.4019 33.8658)
3  0.0261000       2        26     148 POINT (-118.4001 33.8661)
4  0.0306580       2        26     149 POINT (-118.4006 33.8661)
5  0.0123465       2        26     147 POINT (-118.3986 33.8662)
6  0.0000000       3        27     141 POINT (-118.3515 33.8794)
7  0.0194984       3        27     142  POINT (-118.3493 33.881)
8  0.0241388       3        27     143 POINT (-118.3488 33.8811)
9  0.0325544       3        27     144 POINT (-118.3479 33.8817)
10 0.0335094       3        27     145 POINT (-118.3478 33.8819)
# Preview the first 10 raw XY (long-lat) coordinates
st_coordinates(fall) %>% head(10)
              X       Y
 [1,] -118.4030 33.8655
 [2,] -118.4019 33.8658
 [3,] -118.4001 33.8661
 [4,] -118.4006 33.8661
 [5,] -118.3986 33.8662
 [6,] -118.3515 33.8794
 [7,] -118.3493 33.8810
 [8,] -118.3488 33.8811
 [9,] -118.3479 33.8817
[10,] -118.3478 33.8819

For later use, convert the Los Angeles grid to sf points and remove points over water to focus on land locations only.

#----- Convert LA grid to sf -----

# Check initial class of la_grid
class(la_grid)
[1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame" 
# Filter out rows with -Inf in D2A1, remove redundant lambert columns, and convert to sf
la_grid <- la_grid %>%
  filter(D2A1 != -Inf) %>%
  select(-lambert_x, -lambert_y) %>%
  st_as_sf(coords = c('longitude', 'latitude'), crs = latlong_proj)

# Verify class and summary of the converted object
class(la_grid)
[1] "sf"         "tbl_df"     "tbl"        "data.frame"
la_grid
Simple feature collection with 18711 features and 8 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -118.4985 ymin: 33.6969 xmax: -117.5485 ymax: 34.1819
Geodetic CRS:  WGS 84
# A tibble: 18,711 × 9
   native_id      D2A1 A1_50 A23_400 Pop_5000     D2C D2Comm Int_3000
 * <chr>         <dbl> <dbl>   <dbl>    <dbl>   <dbl>  <dbl>    <dbl>
 1 grid_la_4444   3.19     0  0.118     1.95  0.0388  0.132  0.247   
 2 grid_la_8551   3.73     0  0         0.336 0.15    0.348  0.00344 
 3 grid_la_9643   3.67     0  0.0976    1.99  0.15    0.0581 0.270   
 4 grid_la_2319   3.85     0  0         0.626 0.0195  0.0236 0.154   
 5 grid_la_10320  3.65     0  0         0.908 0.0114  0.184  0.0495  
 6 grid_la_7622   2.88     0  0         0.399 0.15    0.241  0.0145  
 7 grid_la_640    3.72     0  0         0     0.0281  0.547  0.000152
 8 grid_la_423    3.67     0  0.0487    0.725 0.00323 0.03   0.0953  
 9 grid_la_7870   3.30     0  0.171     3.01  0.0972  0      0.251   
10 grid_la_1972   2.88     0  0         1.38  0.00216 0.152  0.0866  
# ℹ 18,701 more rows
# ℹ 1 more variable: geometry <POINT [°]>
# Download the land polygon data as an sf multipolygon
land <- ne_download(scale = "large", type = "land", category = "physical", returnclass = "sf")
Reading layer `ne_10m_land' from data source 
  `/private/var/folders/0f/38191bhn40355b3djsnrcc880000gn/T/RtmpMX5xNb/ne_10m_land.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 11 features and 3 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
Geodetic CRS:  WGS 84
# Crop the land area to the bounding box of the LA grid to reduce processing time
land <- suppressWarnings(st_crop(land, st_bbox(la_grid)))

# Visualize cropped land area (optional)
ggplot(land) + geom_sf()

# Filter la_grid to keep only points that intersect with land
la_grid <- la_grid[st_within(la_grid, land) %>% lengths() > 0,]

# Visualize grid land locations (zoom in)
ggplot(la_grid) + geom_sf(size=0.001)

Visualize the Data

R offers various mapping options, many of which are continuously evolving. Some require API keys, but in this course, we’ll use some options that do not require an API, including ggspatial, maptiles, and terra.

Map Zoom: The zoom parameter controls the map scale. Values range from 1 (global view) to 21 (building level). For city-level detail, zoom levels around 10-12 are generally suitable.

Note: An active internet connection is required to load map tiles.

# ---- LA Map Setup ----
# Define a bounding box (min & max X and Y) with a 10,000m buffer around `la_grid`
map_bbox <- la_grid %>%
  # convert from degrees to meters
  st_transform(crs = lambert_proj) %>%
  # add a buffer around the area for visualization purposes
  st_buffer(dist = 10000) %>%
  # convert back to original CRS
  st_transform(crs = latlong_proj) %>%
  # take the min/max X/Y
  st_bbox()

map_bbox
      xmin       ymin       xmax       ymax 
-118.60716   33.61165 -117.43985   34.27221 
# Base map setup with ggplot2 and ggspatial using OSM tiles
g <- ggplot() +
  ggspatial::annotation_map_tile(type = "osm", zoom = 10) +
  labs(title = "LA Grid with Map") +
  theme_minimal()

# # alternative background map with the maptiles package
# tiles <- maptiles::get_tiles(x = st_bbox(la_grid), provider = "OpenStreetMap")
# g <- ggplot() +
#   # Add basemap tiles as background
#   layer_spatial(tiles)

# Plot background map and the LA grid (zoom in)
g + 
  geom_sf(data = la_grid, size=0.001)
Zoom: 10

# Add NOx data (transformed to original scale) with additional map elements
# Add NOx data (transformed to original scale) with additional map elements
g + 
  geom_sf(data = fall, aes(color = exp(ln_nox))) + 
  scale_color_viridis_c() +  # Color-friendly scale
  theme_void() +  # Clean layout for map aesthetics
  theme(
    # `size` was deprecated for element_rect() in ggplot2 3.4.0; use `linewidth`
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  ) + 
  ggspatial::annotation_scale(location = "bl", width_hint = 0.3, unit_category = "imperial") +  # Scale in miles
  ggspatial::annotation_north_arrow(location = "tr", which_north = "true", style = north_arrow_fancy_orienteering) +  # North arrow
  labs(title = "Map of Los Angeles with the\nFall Snapshot Data",
       col="NOx (ppb)"
       )
Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
Zoom: 10

Empirical (Data-Driven) Variograms

Variograms help us understand how our variable of interest (e.g., ln_nox) varies over space. The following code demonstrates empirical variograms using different methods:

  1. Variogram Cloud: Shows all squared distances (cloud = TRUE).
  2. Binned Variogram: The default view, which bins distances into groups.
  3. Binned Variogram with Point Counts: Displays the number of point pairs in each bin.
  4. Smoothed Cloud Variogram: Uses ggplot to overlay a smooth curve on the cloud variogram.

In gstat, the cutoff parameter controls the maximum distance used, with a default of 1/3 of the maximum possible distance. Here, we adjust the cutoff in the example.

For details on semi-variance (gamma) and distance, refer to this helpful document: An Introduction to (Geo)statistics with R, also available on the course Canvas site.

# ---- Empirical Variogram Plots ----

# Variogram Cloud
vgm_fall <- variogram(ln_nox ~ 1, fall, cloud = TRUE)

plot(vgm_fall)

# Binned Variogram (default)
plot(variogram(ln_nox ~ 1, fall))

# Binned Variogram with Point Counts
plot(variogram(ln_nox ~ 1, fall), pl = TRUE)

# Smoothed Cloud Variogram using ggplot2
ggplot(data = vgm_fall, aes(x = dist, y = gamma)) +
  geom_point(shape = 1, alpha = 0.6) +
  geom_smooth(se = FALSE, color = "red", linetype = "solid",
              # making span > 0.75 (default) makes this less wiggly so we can better see the general trend 
              method = "loess", 
              span = .8
              ) +
  labs(x = "Distance (meters)", 
       y = "Semi-variance",
       title = "Semi-variogram Cloud with Smoothed Curve") +
  theme_bw() +
  theme(legend.position = "none")
`geom_smooth()` using formula = 'y ~ x'

Ordinary Kriging (OK) - Brief Example

Kriging provides predictions at new locations not used in model fitting. We’ll use the krige function, specifying the prediction locations with the newdata = argument.

Fitting a Variogram Model

First, we fit a variogram model to use in the model = option for kriging. In this example, we perform ordinary kriging, assuming a common mean across locations.

To parameterize the structured error in kriging, we need an estimated variogram model that provides initial values for the partial sill (σ²) and range (ϕ). The following code evaluates three variogram models (exponential, spherical, and Matern), selecting the best fit.

# ---- Modeled Variogram ----

# Estimate the empirical variogram
##  By default, the variogram() function limits the maximum lag distance. Increasing the cutoff parameter will allow it to calculate semivariance values over a larger distance, which might help the semivariogram level off if it's naturally reaching a sill
v <- variogram(ln_nox ~ 1, data=fall, cutoff=30)
# as before:
plot(v)

# Fit a model to the variogram, trying exponential, spherical, and Matern options
v.fit <- fit.variogram(v, vgm(c("Exp", "Sph", "Mat")))

# Display the selected variogram model and its parameters (sill, range, nugget)
# Note: The spherical model (Sph) was selected with the following parameters:
#       - Range: ~27.0 (great-circle km, since the data are in geographic coordinates)
#       - Partial sill (structured variance): 0.113
#       - Nugget (small-scale variability): 0.0194
v.fit
  model      psill   range
1   Nug 0.01943671  0.0000
2   Sph 0.11323955 27.0123
# Plot the empirical variogram with the fitted model overlaid
# consider expanding the x-axis here
plot(v, v.fit)

Predict with Ordinary Kriging

This code demonstrates ordinary kriging to predict ln_nox at new locations (from la_grid) using the fitted variogram model (v.fit).

# ---- Ordinary Kriging ----

# Ordinary kriging of ln_nox
# First two arguments: formula and data
lnox.kr <- krige(ln_nox ~ 1, fall, newdata = la_grid, model = v.fit)
[using ordinary kriging]
# Plot kriging predictions
# var1.pred contains the predicted values
pl1 <- plot(lnox.kr["var1.pred"], main = "OK Prediction of Log(NOx)")

# Calculate and plot kriging standard errors
# Some places have more uncertainty in their estimates
lnox.kr$se <- sqrt(lnox.kr$var1.var)  # Standard error is the square root of variance
pl2 <- plot(lnox.kr["se"], main = "OK Prediction Error (Standard Error)")

Universal Kriging (UK) - Primary Focus

Universal kriging allows us to include covariates for the fixed part of the model. Unlike ArcGIS, which traditionally limited universal kriging to latitude and longitude as predictors, R’s gstat package allows arbitrary covariates. The following code demonstrates universal kriging using covariates from Mercer to predict ln_nox.

Fitting a Variogram for Universal Kriging

Note: See convergence errors w/ m.uk. You may need to play around with starting values to help with convergence.

# ---- Universal Kriging ----

# Estimate the variogram with a covariate predictor
v.uk <- variogram(ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm, fall, 
                  cutoff=30)

# plot(v.uk)

# Fit the variogram model with multiple options (Exponential, Spherical, Matern)
# note that this has convergence issues. we'll try one model instead. you can also give it initial values for range, nugget etc. if you have an understanding of what these might be

# has convergence issues if you don't extend the variogram() cutoff. 
m.uk <- fit.variogram(v.uk, vgm(c("Exp", "Sph", "Mat")))

# Alternatively, you could fit the variogram with modified initial values
# m.uk <- fit.variogram(v.uk, vgm("Sph", nugget = 0.02, psill = 3, range = 30))
# m.uk <- fit.variogram(v.uk, vgm("Sph", nugget = 0.01, psill = 1, range = 50))


# Display the selected variogram model parameters
m.uk
  model      psill   range
1   Nug 0.01808767  0.0000
2   Sph 0.02137579 35.1012
# Plot the empirical variogram with the fitted model
plot(v.uk, model = m.uk)

Predict with Universal Kriging

The following code demonstrates universal kriging to predict ln_nox on the grid (la_grid), using covariates from Mercer and the fitted variogram model (m.uk).

Side Note on evaluating linear relationships: To evaluate a covariate for the mean model in universal kriging, we could examine the relationship between each variable (e.g., Pop_5000 - population within 5000 meters) and ln_nox (log-transformed NOx levels). If the relationship appears close to linear, we can include this variable as a covariate in the mean model without any transformations. We only show this once for illustrative purposes. Consider repeating this for other variables to identify whether transformations may be needed.

# Plot to evaluate linearity between Pop_5000 and ln_nox
ggplot(fall, aes(x = Pop_5000, y = ln_nox)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth() +
  labs(x = "Population within 5000 meters", y = "Log-transformed NOx", 
       title = "Relationship between Pop_5000 and ln_nox")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

We’ll fit a model following something similar to the Mercer approach.

# ---- Universal Kriging Prediction ----

# Fit the universal kriging model and predict on the grid
lnox.kr <- krige(ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm, fall, 
                 newdata = la_grid, 
                 model = m.uk)
[using universal kriging]
# Calculate standard errors to assess prediction uncertainty across locations
lnox.kr$se <- sqrt(lnox.kr$var1.var)  # Standard error is the square root of variance

# Plot UK predictions
pl3 <- plot(lnox.kr["var1.pred"], main = "UK Prediction of Log(NOx)")

# Plot UK prediction standard errors
pl4 <- plot(lnox.kr["se"], main = "UK Prediction Error (Standard Error)")

Cross-Validation with gstat

The gstat package offers krige.cv for performing kriging with cross-validation. By default, it uses leave-one-out (LOO) cross-validation, but you can specify nfold for k-fold cross-validation. If nfold is a scalar, it divides the data randomly into that many folds. Alternatively, you can pass a vector of group identifiers to control the folds (e.g., clusters in the data).

We first define two helper functions for later use: krige.cv.bubble to create a bubble plot of kriging residuals and krige.cv.stats to calculate model performance metrics (RMSE and R²). Then we demonstrate ordinary kriging (OK) and universal kriging (UK) with both 5-fold CV and LOO. Typically, OK performs better with LOO, while UK shows minimal change between CV approaches.

# ---- Define Cross-Validation Functions ----

# krige.cv2(): a thin wrapper around gstat::krige.cv() that keeps the CRS
# of the input sf object. krige.cv() has a known bug where the projection
# information is dropped from its output (fixed upstream on GitHub, but this
# wrapper may still be needed). All arguments are forwarded unchanged; the
# default nfold = nrow(locations) gives leave-one-out cross-validation.
krige.cv2 <- function(formula, locations, model = NULL, ..., beta = NULL, 
                      nmax = Inf, nmin = 0, maxdist = Inf, 
                      nfold = nrow(locations),  # default is leave-one-out
                      verbose = interactive(), 
                      debug.level = 0) {
  
  # Run the standard cross-validation, passing every argument through
  result <- krige.cv(formula = formula, locations = locations, model = model, ..., 
                     beta = beta, nmax = nmax, nmin = nmin, maxdist = maxdist, 
                     nfold = nfold, verbose = verbose, debug.level = debug.level)
  
  # If the output lost its projection, restore it from the input data
  if (is.na(st_crs(result))) {
    st_crs(result) <- st_crs(locations)
  }
  
  result
}

# krige.cv.bubble(): bubble plot of kriging cross-validation residuals.
# Point size encodes the residual magnitude; color encodes its sign.
#   cv.out     - sf output from krige.cv()/krige.cv2() with a `residual` column
#   plot_title - title for the plot
krige.cv.bubble <- function(cv.out, plot_title) {
  ggplot(data = cv.out) +
    geom_sf(aes(size = abs(residual), color = factor(residual > 0)), alpha = 0.5) +
    scale_size_continuous(name = '|Residual|') +
    scale_color_discrete(name = 'Residual > 0', direction = -1) +
    theme_bw() +
    labs(title = plot_title)
}

# krige.cv.stats(): summarize cross-validation performance as RMSE and an
# MSE-based R^2, returned as a one-row tibble.
#   krige.cv.output - sf/data frame from krige.cv() with `observed` and
#                     `var1.pred` columns
#   description     - label for the row in the summary table
krige.cv.stats <- function(krige.cv.output, description) {
  obs <- krige.cv.output$observed
  pred <- krige.cv.output$var1.pred
  
  # Mean squared prediction error, and the variability of the observations
  # around their mean (the denominator of the MSE-based R^2)
  mse_pred <- mean((obs - pred)^2)
  mse_obs <- mean((obs - mean(obs))^2)
  
  # R^2 is truncated at zero so models that do worse than the mean report 0
  tibble(
    Description = description, 
    RMSE = round(sqrt(mse_pred), 4), 
    MSE_based_R2 = round(max(1 - mse_pred / mse_obs, 0), 4)
  )
}

Generate cross-validated predictions.

For UK, we’ll use the covariates reported by Mercer et al. (2011).

# ---- Cross-Validation ----

# Perform Ordinary Kriging (OK) with 5-fold Cross-Validation
cv5 <- krige.cv2(ln_nox ~ 1, fall, model = v.fit, 
                 nfold = 5)

# Plot residuals for OK with 5-fold CV (We'll only show this once for illustration. There are better ways of comparing residuals over space from different models in separate plots.)
krige.cv.bubble(cv.out = cv5, 
                plot_title = "log(NOx) OK Results: 5-Fold CV Residuals")

# Perform Ordinary Kriging (OK) with Leave-One-Out Cross-Validation (LOOCV)
cvLOO <- krige.cv2(ln_nox ~ 1, fall, model = v.fit)


# Perform Universal Kriging (UK) with 5-fold Cross-Validation
cv5uk <- krige.cv2(ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm, fall, 
                   model = m.uk, 
                   nfold = 5)

# Perform Universal Kriging (UK) with Leave-One-Out Cross-Validation (LOOCV)
cvLOOuk <- krige.cv2(ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm, fall, 
                     model = m.uk
                     )

# Calculate and compare performance statistics across cross-validation methods
# Compile results into a summary table
bind_rows(
  krige.cv.stats(cv5, "OK: 5-Fold CV"),
  krige.cv.stats(cvLOO, "OK: LOO CV"),
  krige.cv.stats(cv5uk, "UK: 5-Fold CV"),
  krige.cv.stats(cvLOOuk, "UK: LOO CV")
) %>% 
  kable(caption = "Summary of Kriging Cross-Validation Results for log(NOx)")
Summary of Kriging Cross-Validation Results for log(NOx)
Description RMSE MSE_based_R2
OK: 5-Fold CV 0.1628 0.7353
OK: LOO CV 0.1611 0.7407
UK: 5-Fold CV 0.1470 0.7840
UK: LOO CV 0.1411 0.8010

Predict at New Locations in Los Angeles

This code performs universal kriging with the Mercer et al. covariates (D2A1, A1_50, A23_400, Pop_5000, D2C, Int_3000, D2Comm), predicting ln_nox values at new locations in la_grid.

# ----- Krige in LA -----

kc_la <- krige(
  ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm,                  # Mean model with covariates
  st_transform(fall, lambert_proj),   # Transform `fall` data to Lambert projection
  st_transform(la_grid, lambert_proj),# Transform `la_grid` to Lambert projection
  model = m.uk                        # Use fitted universal kriging variogram model
)
[using universal kriging]
# put these on the native scale
kc_la$var1.pred <- exp(kc_la$var1.pred)

# View the results; predictions are stored in `var1.pred`

Visualize the Predictions

We’ll display kriging predictions for NOx on a map. First, we transform the predictions to latitude-longitude coordinates, then join them with la_grid to ensure each predicted value aligns with the original coordinates. Finally, we overlay the predictions on a background map.

# ----- Plot the Grid Predictions on the Map -----

# Reproject predictions from Lambert to latitude-longitude before plotting
kc_la <- st_transform(kc_la, latlong_proj)

# Verify that coordinates in `kc_la` and `la_grid` are nearly identical
# Floating-point precision can cause slight differences, so `all.equal()`, which allows a small tolerance in the comparison, is used instead of `identical()`.
all.equal(st_coordinates(kc_la), st_coordinates(la_grid))  # Expected result: "TRUE" or a message with small differences
[1] TRUE
# Join LA grid to predictions; do so by nearest feature to avoid precision merging issues
new_grid <- st_join(la_grid, kc_la, join = st_nearest_feature)

# Alternative join using a small tolerance (commented out as it takes longer)
# new_grid <- st_join(la_grid, kc_la, join = st_equals_exact, par = 1e-10)

# Rename the prediction column. (The predictions were already back-transformed
# to the native scale with exp() right after kriging, so no transformation is
# needed here.)
new_grid <- new_grid %>% rename(NOx = var1.pred)

# Verify the join was successful (no NAs in predictions).
# Check the renamed column: after rename(), `var1.pred` no longer exists, so
# testing `new_grid$var1.pred` would be vacuously TRUE (all() of an empty vector).
all(!is.na(new_grid$NOx))  # Should return "TRUE"
[1] TRUE

Plot the point (not smooth) predictions with NOx on a map, highlighting highways or other spatial patterns

g + 
  geom_sf(data = new_grid, aes(color = NOx), alpha = 0.1) +
  # color friendly color scale
  scale_color_viridis_c(option = "plasma") + 
  ggtitle("Map of Los Angeles with Fall UK Predictions Overlaid as Points")
Zoom: 10

Plot smooth gridded predictions. You’ll have to play around with the tile size and transparency (alpha).

Example using ggplot2::geom_tile()

# ----- Plot Smooth Gridded Predictions -----

# Convert `new_grid` to a data frame for easier plotting 
new_grid_df <- as.data.frame(st_coordinates(new_grid))
new_grid_df$NOx <- new_grid$NOx


g + 
  # Set map extent and CRS (bounding box error otherwise results in no background map) 
  coord_sf(xlim = c(map_bbox["xmin"], map_bbox["xmax"]), 
           ylim = c(map_bbox["ymin"], map_bbox["ymax"]), 
           crs = 4326) +  
  geom_tile(data = new_grid_df, aes(x = X, y = Y, fill = NOx), 
            alpha=0.2,
            width = 0.01, height = 0.01 # Adjust width and height as needed
            ) +  
  # color friendly color scale
  scale_fill_viridis_c(option = "plasma") + 
  labs(title = "Map of Los Angeles with Fall UK Predictions (Smoother)",
       col="NOx (ppb)"
       ) +
  theme_minimal()
Loading required namespace: raster
Zoom: 10

Alternative example using maptiles and terra (raster)

# Define the raster resolution (adjust as needed for smoothness)
resolution <- 0.005

# Create a blank raster template using the bounding box of kc_la and specify resolution and CRS
# Use `ext()` to get the bounding box in a format `terra` accepts
kc_rast <- rast(
  ext(kc_la),            # Set extent based on kc_la's bounding box
  resolution = resolution,
  crs = st_crs(kc_la)$wkt  # Use WKT for CRS
)

# Rasterize the sf object (kc_la) using `var1.pred` as the value to fill the raster
kc_rast <- rasterize(kc_la, kc_rast, field = "var1.pred")

# Convert raster to a data frame for ggplot2 compatibility
kc_df <- as.data.frame(kc_rast, xy = TRUE)  # Include coordinates (x, y)
names(kc_df)[3] <- "var1.pred"  # Rename to match the prediction column

# Plot 
## background map
tiles <- maptiles::get_tiles(x = st_bbox(la_grid), provider = "OpenStreetMap")
# Crop tiles to the map bounding box (map_bbox). In theory, you shouldn't have to do this, but the resulting map is otherwise too wide. It may occur because of how the tiles are fetched.
tiles <- crop(tiles, ext(map_bbox$xmin, map_bbox$xmax, map_bbox$ymin, map_bbox$ymax))

ggplot() +
  # Add basemap tiles as background
  layer_spatial(tiles)  + 
  # add the smoothed over predictions
  geom_raster(data = kc_df, aes(x = x, y = y, fill = var1.pred), alpha=0.6) +
  # color friendly color scale
  scale_fill_viridis_c(option = "plasma") + 
  labs(title = "Map of Los Angeles with Fall UK Predictions (Smoother)",
       fill="NOx (ppb)"
       ) +
  theme_minimal()

Other

Ordinary kriging using the residuals from a LUR

The idea is to use a traditional linear model to fit the common model, save the residuals from this model, and then import the residuals into gstat and fit an OK model. This is a homework problem.

Practice Session

During class we will review the output above. Please come prepared to ask questions.

Homework Exercises

Use the snapshot data and focus on the winter season.

  1. Repeat the universal kriging model approach shown above, this time using the winter season. Also repeat the cross-validation. Check whether your performance statistics agree with those reported by Mercer et al. Discuss.

  2. Fit a LUR model (using the common model covariates) to the winter season snapshot data. Take the residuals from this model and evaluate them:

    1. Estimate an empirical binned variogram of the residuals using default bins and the same cutoff we set above. Overlay it with a modeled variogram. Plot this and discuss, including similarities and differences with the variogram estimated for the UK model you fit in exercise 1.
    2. Discuss what you have learned from plotting the variograms. Is there evidence of spatial structure in the data? Do you get different insights from each variogram?
  3. Optional extra credit: Using 10-fold cross-validation with the cluster variable to define the CV groups, estimate predictions from a 2-step model using the common model covariates. For this model you need to separately create cross-validated predictions from the LUR model and from the OK model of the LUR residuals (of the winter season dataset), and sum these to compare with the observed ln_nox data.

  4. Write a basic summary of your understanding of how universal kriging differs from land use regression. What additional insights do you get from the Mercer et al paper now that you have done some kriging analyses using the snapshot data?

  5. Discuss your thoughts on the most useful summaries to show from an exposure prediction analysis.

Appendix

Session information

#-----session information-----

# print R session information
sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 15.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] downloader_0.4                akima_0.6-3.4                
 [3] scales_1.3.0                  Hmisc_5.1-3                  
 [5] gstat_2.1-2                   knitr_1.48                   
 [7] sf_1.0-17                     rnaturalearth_1.0.1          
 [9] prettymapr_0.2.5              terra_1.7-78                 
[11] maptiles_0.8.0                ggspatial_1.1.9              
[13] lubridate_1.9.3               forcats_1.0.0                
[15] stringr_1.5.1                 dplyr_1.1.4                  
[17] purrr_1.0.2                   readr_2.1.5                  
[19] tidyr_1.3.1                   tibble_3.2.1                 
[21] ggplot2_3.5.1                 tidyverse_2.0.0              
[23] rnaturalearthhires_1.0.0.9000 pacman_0.5.1                 

loaded via a namespace (and not attached):
 [1] DBI_1.2.3          gridExtra_2.3      s2_1.1.7           rlang_1.1.4       
 [5] magrittr_2.0.3     e1071_1.7-16       compiler_4.3.1     mgcv_1.9-1        
 [9] png_0.1-8          vctrs_0.6.5        pkgconfig_2.0.3    wk_0.9.3          
[13] crayon_1.5.3       fastmap_1.2.0      backports_1.5.0    labeling_0.4.3    
[17] utf8_1.2.4         rmarkdown_2.28     tzdb_0.4.0         bit_4.5.0         
[21] xfun_0.48          jsonlite_1.8.9     rosm_0.3.0         parallel_4.3.1    
[25] cluster_2.1.6      R6_2.5.1           stringi_1.8.4      rpart_4.1.23      
[29] stars_0.6-6        Rcpp_1.0.13        zoo_1.8-12         base64enc_0.1-3   
[33] FNN_1.1.4.1        Matrix_1.6-1       splines_4.3.1      nnet_7.3-19       
[37] timechange_0.3.0   tidyselect_1.2.1   rstudioapi_0.16.0  abind_1.4-8       
[41] yaml_2.3.10        codetools_0.2-20   curl_5.2.3         lattice_0.22-6    
[45] intervals_0.15.5   plyr_1.8.9         withr_3.0.1        evaluate_1.0.0    
[49] foreign_0.8-87     units_0.8-5        proxy_0.4-27       xts_0.14.0        
[53] pillar_1.9.0       KernSmooth_2.23-24 checkmate_2.3.2    generics_0.1.3    
[57] vroom_1.6.5        sp_2.1-4           spacetime_1.3-2    hms_1.1.3         
[61] munsell_0.5.1      class_7.3-22       glue_1.8.0         tools_4.3.1       
[65] data.table_1.16.0  grid_4.3.1         slippymath_0.3.1   colorspace_2.1-1  
[69] nlme_3.1-166       raster_3.6-30      htmlTable_2.4.3    Formula_1.2-5     
[73] cli_3.6.3          fansi_1.0.6        viridisLite_0.4.2  gtable_0.3.5      
[77] digest_0.6.37      classInt_0.4-10    htmlwidgets_1.6.4  farver_2.1.2      
[81] htmltools_0.5.8.1  lifecycle_1.0.4    httr_1.4.7         bit64_4.5.2       

Embedded code

# Clear workspace of all objects and unload non-base packages
# (Course convention so each rendered document starts from a clean session;
# not recommended inside reusable scripts.)
rm(list = ls(all = TRUE))
if (!is.null(sessionInfo()$otherPkgs)) {
    suppressWarnings(
        lapply(paste('package:', names(sessionInfo()$otherPkgs), sep=""),
               detach, character.only=TRUE, unload=TRUE, force=TRUE)
    )
}

# Load or install 'pacman' for package management
my_repo <- 'http://cran.r-project.org'
if (!require("pacman")) {
    install.packages("pacman", repos = my_repo)
}

# **SPH server**: need to install rnaturalearthhires like so on the SPH server
# (rnaturalearthhires is not on CRAN, so it is installed from the rOpenSci
# R-universe repository as a source package)
if (!require("rnaturalearthhires")) {
    install.packages("rnaturalearthhires", repos = "https://ropensci.r-universe.dev", type = "source")
}

# Load (installing if necessary) all packages used in this document
pacman::p_load(
    tidyverse,                 # Data manipulation and visualization
    # takes a while to install on SPH
    ggspatial,                 # Geospatial extensions for ggplot.  
    # maptiles and tmap libraries can be used instead of or in combination
    # with ggplot + ggspatial. maptiles offers more tile-based map
    # flexibility; ggspatial provides the ability to annotate maps easily;
    # tmap offers both static and interactive maps that we won't review in
    # this course.
    maptiles,
    terra, # alternative mapping with raster files
    
    # need for SPH server?
    prettymapr,
    
    rnaturalearth,             # Land features for map layers (remove water locations)
    rnaturalearthhires,        # High-resolution land features 
    sf,                        # Handling spatial objects (modern replacement for 'sp')
    knitr,                     # Formatting tables with kable()
    gstat,                     # Geostatistical methods (e.g., kriging)
    Hmisc,                     # Data description functions like describe()
    scales,                    # Color scale customization for ggplot
    akima,                     # Bivariate interpolation for irregular data
    downloader                 # Downloading files over HTTP/HTTPS
)


# **Mac Users**: If you encounter issues with 'rgdal' on macOS Catalina or newer,
# you may need to install GDAL via terminal commands. Instructions are available [here](https://medium.com/@egiron/how-to-install-gdal-and-qgis-on-macos-catalina-ca690dca4f91).


# create "Datasets" directory if one does not already exist    
dir.create(file.path("Datasets"), showWarnings=FALSE, recursive = TRUE)

# specify data path
data_path <- file.path("Datasets")

# specify the file names and paths
snapshot.file <- "snapshot_3_5_19.csv"
snapshot.path <- file.path(data_path, snapshot.file)
grid.file <- "la_grid_3_5_19.csv"
grid.path <- file.path(data_path, grid.file)

# Download the file if it is not already present
if (!file.exists(snapshot.path)) {
    url <- paste("https://faculty.washington.edu/sheppard/envh556/Datasets", 
                 snapshot.file, sep = '/')
    download.file(url = url, destfile = snapshot.path)
}
if (!file.exists(grid.path)) {
    url <- paste("https://faculty.washington.edu/sheppard/envh556/Datasets", 
                 grid.file, sep = '/')
    download.file(url = url, destfile = grid.path)
}

# Output a warning message if the file cannot be found
if (file.exists(snapshot.path)) {
    snapshot <- read_csv(file = snapshot.path)
} else warning(paste("Can't find", snapshot.file, "!"))

if (file.exists(grid.path)) {
    la_grid <- read_csv(file = grid.path)
} else warning(paste("Can't find", grid.file, "!"))

# Remove temporary variables.
# Fix: the previous call rm(url, file_name, file_path, data_path) produced
# warnings because `file_name` and `file_path` are never defined in this
# document, and `url` only exists when a download actually occurred.
# Remove only the temporary objects that actually exist.
rm(list = intersect(c("url", "data_path"), ls()))

# ----- basics -----

# Quick look at the raw snapshot data: first rows, structure, and summaries
head(snapshot)

# Both str() and glimpse() show the structure; glimpse() (from dplyr) stays
# readable once this is converted to a spatial dataset
str(snapshot)
glimpse(snapshot)

# Descriptive summary of every column
summary(snapshot)

#---- Read fall snapshot as an sf object -----

# Coordinate reference systems, given two ways as an example:
# an EPSG code (number) and a proj4 string.
# WGS84 latitude-longitude. same as: "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
latlong_proj <- 4326
# Lambert conformal conic projection in meters, needed for distance calculations
lambert_proj <- "+proj=lcc +lat_1=33 +lat_2=45 +lat_0=39 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs"

# Keep the fall-season records with the outcome, covariates, and IDs, then
# convert directly to an sf point object with WGS84 coordinates
fall <- snapshot %>%
    filter(seasonfac == "2Fall") %>%
    select(ID, latitude, longitude, ln_nox,
           D2A1, A1_50, A23_400, Pop_5000, D2C, Int_3000, D2Comm,
           cluster, group_loc, FieldID) %>%
    as.data.frame() %>%
    st_as_sf(coords = c('longitude', 'latitude'), crs = latlong_proj)

# Print the sf object to see its geometry type, CRS, and columns
fall

# Preview the first 10 raw XY (long-lat) coordinates
head(st_coordinates(fall), 10)

#----- Convert LA grid to sf -----

# Check initial class of la_grid
class(la_grid)

# Filter out rows with -Inf in D2A1, remove redundant lambert columns, and convert to sf
la_grid <- la_grid %>%
  filter(D2A1 != -Inf) %>%
  select(-lambert_x, -lambert_y) %>%
  st_as_sf(coords = c('longitude', 'latitude'), crs = latlong_proj)

# Verify class and summary of the converted object
class(la_grid)
la_grid

# Download the land polygon data as an sf multipolygon
# (requires internet access; large-scale physical data from Natural Earth)
land <- ne_download(scale = "large", type = "land", category = "physical", returnclass = "sf")

# Crop the land area to the bounding box of the LA grid to reduce processing time
# (st_crop warns that attributes are assumed spatially constant; safe to suppress here)
land <- suppressWarnings(st_crop(land, st_bbox(la_grid)))

# Visualize cropped land area (optional)
ggplot(land) + geom_sf()

# Filter la_grid to keep only points that intersect with land
# st_within() returns, for each grid point, the list of land polygons that
# contain it; keep points with at least one match (i.e., drop water locations)
la_grid <- la_grid[st_within(la_grid, land) %>% lengths() > 0,]

# Visualize grid land locations (zoom in)
ggplot(la_grid) + geom_sf(size=0.001)

# ---- LA Map Setup ----
# Define a bounding box (min & max X and Y) with a 10,000m buffer around `la_grid`
# (buffering is done in the Lambert projection so the distance is in meters)
map_bbox <- la_grid %>%
  # convert from degrees to meters
  st_transform(crs = lambert_proj) %>%
  # add a buffer around the area for visualization purposes
  st_buffer(dist = 10000) %>%
  # convert back to original CRS
  st_transform(crs = latlong_proj) %>%
  # take the min/max X/Y
  st_bbox()

map_bbox

# Base map setup with ggplot2 and ggspatial using OSM tiles
# (annotation_map_tile() downloads OpenStreetMap tiles at the given zoom level)
g <- ggplot() +
  ggspatial::annotation_map_tile(type = "osm", zoom = 10) +
  labs(title = "LA Grid with Map") +
  theme_minimal()

# # alternative background map with the maptiles package
# tiles <- maptiles::get_tiles(x = st_bbox(la_grid), provider = "OpenStreetMap")
# g <- ggplot() +
#   # Add basemap tiles as background
#   layer_spatial(tiles)

# Plot background map and the LA grid (zoom in)
g + 
  geom_sf(data = la_grid, size=0.001)


# Add NOx data (transformed to original scale) with additional map elements
g + 
  geom_sf(data = fall, aes(color = exp(ln_nox))) + 
  scale_color_viridis_c() +  # Color-friendly scale
  theme_void() +  # Clean layout for map aesthetics
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1)
  ) + 
  ggspatial::annotation_scale(location = "bl", width_hint = 0.3, unit_category = "imperial") +  # Scale in miles
  ggspatial::annotation_north_arrow(location = "tr", which_north = "true", style = north_arrow_fancy_orienteering) +  # North arrow
  labs(title = "Map of Los Angeles with the\nFall Snapshot Data",
       col="NOx (ppb)"
       )

# ---- Empirical Variogram Plots ----

# Variogram Cloud: semi-variance for every pair of monitoring locations.
# Note: `fall` is in unprojected lat-long coordinates, so gstat computes
# great-circle distances, reported in km (see ?gstat::variogram).
vgm_fall <- variogram(ln_nox ~ 1, fall, cloud = TRUE)

plot(vgm_fall)

# Binned Variogram (default)
plot(variogram(ln_nox ~ 1, fall))

# Binned Variogram with Point Counts
plot(variogram(ln_nox ~ 1, fall), pl = TRUE)

# Smoothed Cloud Variogram using ggplot2
ggplot(data = vgm_fall, aes(x = dist, y = gamma)) +
  geom_point(shape = 1, alpha = 0.6) +
  geom_smooth(se = FALSE, color = "red", linetype = "solid",
              # making span > 0.75 (default) makes this less wiggly so we can better see the general trend 
              method = "loess", 
              span = .8
              ) +
  # Fix: distances from gstat are great-circle km for lat-long data, not meters
  labs(x = "Distance (km)", 
       y = "Semi-variance",
       title = "Semi-variogram Cloud with Smoothed Curve") +
  theme_bw() +
  theme(legend.position = "none")

# ---- Modeled Variogram ----

# Estimate the empirical variogram
##  By default, the variogram() function limits the maximum lag distance. Increasing the cutoff parameter will allow it to calculate semivariance values over a larger distance, which might help the semivariogram level off if it's naturally reaching a sill
## (the cutoff is in km here, since gstat uses great-circle km for unprojected lat-long data)
v <- variogram(ln_nox ~ 1, data=fall, cutoff=30)
# as before:
plot(v)

# Fit a model to the variogram, trying exponential, spherical, and Matern options
v.fit <- fit.variogram(v, vgm(c("Exp", "Sph", "Mat")))

# Display the selected variogram model and its parameters (sill, range, nugget)
# Note: The spherical model (Sph) was selected with the following parameters:
#       - Range: 31.69 (in km, not meters, since distances here are great-circle km)
#       - Partial sill (structured variance): 0.129
#       - Nugget (small-scale variability): 0.0195
v.fit

# Plot the empirical variogram with the fitted model overlaid
# consider expanding the x-axis here
plot(v, v.fit)

# ---- Ordinary Kriging ----

# Ordinary kriging of ln_nox
# First two arguments: formula and data; `~ 1` specifies a constant
# (unknown) mean, i.e., ordinary kriging
lnox.kr <- krige(ln_nox ~ 1, fall, newdata = la_grid, model = v.fit)

# Plot kriging predictions
# var1.pred contains the predicted values
pl1 <- plot(lnox.kr["var1.pred"], main = "OK Prediction of Log(NOx)")

# Calculate and plot kriging standard errors
# Some places have more uncertainty in their estimates
lnox.kr$se <- sqrt(lnox.kr$var1.var)  # Standard error is the square root of variance
pl2 <- plot(lnox.kr["se"], main = "OK Prediction Error (Standard Error)")

# ---- Universal Kriging ----

# Estimate the variogram with a covariate predictor
# (universal kriging: the mean model includes the common-model covariates,
# so this variogram describes the residuals from that mean model)
v.uk <- variogram(ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm, fall, 
                  cutoff=30)

# plot(v.uk)

# Fit the variogram model with multiple options (Exponential, Spherical, Matern)
# note that this has convergence issues. we'll try one model instead. you can also give it initial values for range, nugget etc. if you have an understanding of what these might be

# has convergence issues if you don't extend the variogram() cutoff. 
m.uk <- fit.variogram(v.uk, vgm(c("Exp", "Sph", "Mat")))

# Alternatively, you could fit the variogram with modified initial values
# m.uk <- fit.variogram(v.uk, vgm("Sph", nugget = 0.02, psill = 3, range = 30))
# m.uk <- fit.variogram(v.uk, vgm("Sph", nugget = 0.01, psill = 1, range = 50))


# Display the selected variogram model parameters
m.uk

# Plot the empirical variogram with the fitted model
plot(v.uk, model = m.uk)

# Plot to evaluate linearity between Pop_5000 and ln_nox
# (the second geom_smooth() adds the default loess curve; compare with the lm line)
ggplot(fall, aes(x = Pop_5000, y = ln_nox)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth() +
  labs(x = "Population within 5000 meters", y = "Log-transformed NOx", 
       title = "Relationship between Pop_5000 and ln_nox")

# ---- Universal Kriging Prediction ----

# Fit the universal kriging model and predict on the grid
# (la_grid must contain the same covariates used in the mean model)
lnox.kr <- krige(ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm, fall, 
                 newdata = la_grid, 
                 model = m.uk)

# Calculate standard errors to assess prediction uncertainty across locations
lnox.kr$se <- sqrt(lnox.kr$var1.var)  # Standard error is the square root of variance

# Plot UK predictions
pl3 <- plot(lnox.kr["var1.pred"], main = "UK Prediction of Log(NOx)")

# Plot UK prediction standard errors
pl4 <- plot(lnox.kr["se"], main = "UK Prediction Error (Standard Error)")

# ---- Define Cross-Validation Functions ----

# Wrapper function krige.cv2() to retain the projection of the sf object.
# This fixes a known bug in krige.cv() where projection information is lost.
# (Bug reported and fixed on GitHub, but this wrapper may be required for now.)
#
# Arguments mirror gstat::krige.cv(); `...` is forwarded unchanged.
# Returns the krige.cv() result, with the CRS of `locations` restored when
# the output lacks one.
krige.cv2 <- function(formula, locations, model = NULL, ..., beta = NULL, 
                      nmax = Inf, nmin = 0, maxdist = Inf, 
                      nfold = nrow(locations),  # default is leave-one-out
                      verbose = interactive(), 
                      debug.level = 0) {
  
  # Perform cross-validation and retain projection if it's missing
  krige.cv1 <- krige.cv(formula = formula, locations = locations, model = model, ..., 
                        beta = beta, nmax = nmax, nmin = nmin, maxdist = maxdist, 
                        nfold = nfold, verbose = verbose, debug.level = debug.level)
  
  # Set projection from input data if krige.cv output lacks it
  if (is.na(st_crs(krige.cv1))) {
    st_crs(krige.cv1) <- st_crs(locations)
  }
  return(krige.cv1)
}

# Bubble plot of kriging cross-validation residuals: point size encodes the
# magnitude of each residual, color encodes its sign.
#
# cv.out:     sf object from krige.cv()/krige.cv2() with a `residual` column
# plot_title: title for the plot
# Returns a ggplot object.
krige.cv.bubble <- function(cv.out, plot_title) {
  ggplot(data = cv.out) +
    geom_sf(aes(size = abs(residual), color = factor(residual > 0)), alpha = 0.5) +
    scale_size_continuous(name = '|Residual|') +
    scale_color_discrete(name = 'Residual > 0', direction = -1) +
    labs(title = plot_title) +
    theme_bw()
}

# Cross-validation performance metrics (RMSE and MSE-based R2) from
# krige.cv output.
#
# krige.cv.output: result of krige.cv()/krige.cv2(), with `observed` and
#                  `var1.pred` columns
# description:     label for this row of the summary table
# Returns a one-row tibble with Description, RMSE, and MSE_based_R2.
krige.cv.stats <- function(krige.cv.output, description) {
  obs <- krige.cv.output$observed
  pred <- krige.cv.output$var1.pred

  # Mean squared error of the predictions, and of the null (mean-only) model
  mse_model <- mean((obs - pred)^2)
  mse_null <- mean((obs - mean(obs))^2)

  # R2 is truncated at zero so a model worse than the mean reports 0
  tibble(
    Description = description,
    RMSE = round(sqrt(mse_model), 4),
    MSE_based_R2 = round(max(1 - mse_model / mse_null, 0), 4)
  )
}

# ---- Cross-Validation ----

# Perform Ordinary Kriging (OK) with 5-fold Cross-Validation
# (krige.cv assigns folds at random, so results can vary run to run)
cv5 <- krige.cv2(ln_nox ~ 1, fall, model = v.fit, 
                 nfold = 5)

# Plot residuals for OK with 5-fold CV (We'll only show this once for illustration. There are better ways of comparing residuals over space from different models in separate plots.)
krige.cv.bubble(cv.out = cv5, 
                plot_title = "log(NOx) OK Results: 5-Fold CV Residuals")

# Perform Ordinary Kriging (OK) with Leave-One-Out Cross-Validation (LOOCV)
# (the krige.cv2 default nfold = number of rows, i.e., leave-one-out)
cvLOO <- krige.cv2(ln_nox ~ 1, fall, model = v.fit)


# Perform Universal Kriging (UK) with 5-fold Cross-Validation
cv5uk <- krige.cv2(ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm, fall, 
                   model = m.uk, 
                   nfold = 5)

# Perform Universal Kriging (UK) with Leave-One-Out Cross-Validation (LOOCV)
cvLOOuk <- krige.cv2(ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm, fall, 
                     model = m.uk
                     )

# Calculate and compare performance statistics across cross-validation methods
# Compile results into a summary table
bind_rows(
  krige.cv.stats(cv5, "OK: 5-Fold CV"),
  krige.cv.stats(cvLOO, "OK: LOO CV"),
  krige.cv.stats(cv5uk, "UK: 5-Fold CV"),
  krige.cv.stats(cvLOOuk, "UK: LOO CV")
) %>% 
  kable(caption = "Summary of Kriging Cross-Validation Results for log(NOx)")

# ----- Krige in LA -----

# Universal kriging predictions over the LA grid, fit and predicted in the
# Lambert (meters) projection
kc_la <- krige(
  ln_nox ~ D2A1 + A1_50 + A23_400 + Pop_5000 + D2C + Int_3000 + D2Comm,                  # Mean model with covariates
  st_transform(fall, lambert_proj),   # Transform `fall` data to Lambert projection
  st_transform(la_grid, lambert_proj),# Transform `la_grid` to Lambert projection
  model = m.uk                        # Use fitted universal kriging variogram model
)

# put these on the native scale (NOx rather than log(NOx))
kc_la$var1.pred <- exp(kc_la$var1.pred)

# View the results; predictions are stored in `var1.pred`

# ----- Plot the Grid Predictions on the Map -----

# Reproject predictions from Lambert to latitude-longitude before plotting
kc_la <- st_transform(kc_la, latlong_proj)

# Verify that coordinates in `kc_la` and `la_grid` are nearly identical.
# Floating-point precision can cause slight differences, so `all.equal()`,
# which allows a small tolerance in the comparison, is used instead of
# `identical()`.
all.equal(st_coordinates(kc_la), st_coordinates(la_grid))  # Expected result: "TRUE" or a message with small differences
 
# Join LA grid to predictions; do so by nearest feature to avoid precision merging issues
new_grid <- st_join(la_grid, kc_la, join = st_nearest_feature)

# Alternative join using a small tolerance (commented out as it takes longer)
# new_grid <- st_join(la_grid, kc_la, join = st_equals_exact, par = 1e-10)

# Rename the prediction column (it is already on the native scale from the
# exp() transformation above)
new_grid <- new_grid %>% rename(NOx = var1.pred)

# Verify the join was successful (no NAs in predictions)
# Fix: check the renamed `NOx` column; the previous check of `var1.pred`
# always returned TRUE vacuously because that column no longer exists
# after rename()
all(!is.na(new_grid$NOx))  # Should return "TRUE"

g + 
  geom_sf(data = new_grid, aes(color = NOx), alpha = 0.1) +
  # color friendly color scale
  scale_color_viridis_c(option = "plasma") + 
  ggtitle("Map of Los Angeles with Fall UK Predictions Overlaid as Points")


# ----- Plot Smooth Gridded Predictions -----

# Convert `new_grid` to a data frame for easier plotting 
new_grid_df <- as.data.frame(st_coordinates(new_grid))
new_grid_df$NOx <- new_grid$NOx


g + 
  # Set map extent and CRS (bounding box error otherwise results in no background map) 
  coord_sf(xlim = c(map_bbox["xmin"], map_bbox["xmax"]), 
           ylim = c(map_bbox["ymin"], map_bbox["ymax"]), 
           crs = 4326) +  
  geom_tile(data = new_grid_df, aes(x = X, y = Y, fill = NOx), 
            alpha=0.2,
            width = 0.01, height = 0.01 # Adjust width and height as needed
            ) +  
  # color friendly color scale
  scale_fill_viridis_c(option = "plasma") + 
  labs(title = "Map of Los Angeles with Fall UK Predictions (Smoother)",
       col="NOx (ppb)"
       ) +
  theme_minimal()

# Define the raster resolution in CRS units (degrees here, since kc_la was
# reprojected to lat-long above; adjust as needed for smoothness)
resolution <- 0.005

# Create a blank raster template using the bounding box of kc_la and specify resolution and CRS
# Use `ext()` to get the bounding box in a format `terra` accepts
kc_rast <- rast(
  ext(kc_la),            # Set extent based on kc_la's bounding box
  resolution = resolution,
  crs = st_crs(kc_la)$wkt  # Use WKT for CRS
)

# Rasterize the sf object (kc_la) using `var1.pred` as the value to fill the raster
kc_rast <- rasterize(kc_la, kc_rast, field = "var1.pred")

# Convert raster to a data frame for ggplot2 compatibility
kc_df <- as.data.frame(kc_rast, xy = TRUE)  # Include coordinates (x, y)
names(kc_df)[3] <- "var1.pred"  # Rename to match the prediction column

# Plot 
## background map
tiles <- maptiles::get_tiles(x = st_bbox(la_grid), provider = "OpenStreetMap")
# Crop tiles to the bounding box of la_grid. In theory, you shouldn't have to
# do this, but the resulting map is too wide otherwise. It may occur because
# of how the tiles are fetched.
tiles <- crop(tiles, ext(map_bbox$xmin, map_bbox$xmax, map_bbox$ymin, map_bbox$ymax))

ggplot() +
  # Add basemap tiles as background
  layer_spatial(tiles)  + 
  # add the smoothed over predictions
  geom_raster(data = kc_df, aes(x = x, y = y, fill = var1.pred), alpha=0.6) +
  # color friendly color scale
  scale_fill_viridis_c(option = "plasma") + 
  labs(title = "Map of Los Angeles with Fall UK Predictions (Smoother)",
       fill="NOx (ppb)"
       ) +
  theme_minimal()

#-----session information-----

# print R session information (reproducibility record of R and package versions)
sessionInfo()

#-----code appendix-----
#-----functions-----

# Show the names of all functions defined in the .Rmd
# (e.g. loaded in the environment)
lsf.str()

# Show the definitions of all functions loaded into the current environment  
lapply(c(lsf.str()), getAnywhere)

Functions defined

#-----functions-----

# Show the names of all functions defined in the .Rmd
# (e.g. loaded in the environment)
lsf.str()
krige.cv.bubble : function (cv.out, plot_title)  
krige.cv.stats : function (krige.cv.output, description)  
krige.cv2 : function (formula, locations, model = NULL, ..., beta = NULL, nmax = Inf, 
    nmin = 0, maxdist = Inf, nfold = nrow(locations), verbose = interactive(), 
    debug.level = 0)  
# Show the definitions of all functions loaded into the current environment  
lapply(c(lsf.str()), getAnywhere)
Warning in findGeneric(f, envir): 'krige' is a formal generic function; S3
methods will not likely be found
Warning in findGeneric(f, envir): 'krige.cv' is a formal generic function; S3
methods will not likely be found
Warning in findGeneric(f, envir): 'krige' is a formal generic function; S3
methods will not likely be found
Warning in findGeneric(f, envir): 'krige.cv' is a formal generic function; S3
methods will not likely be found
Warning in findGeneric(f, envir): 'krige' is a formal generic function; S3
methods will not likely be found
[[1]]
A single object matching 'krige.cv.bubble' was found
It was found in the following places
  .GlobalEnv
with value

<srcref: file "" chars 25:20 to 32:1>

[[2]]
A single object matching 'krige.cv.stats' was found
It was found in the following places
  .GlobalEnv
with value

<srcref: file "" chars 35:19 to 49:1>
<bytecode: 0x178f45040>

[[3]]
A single object matching 'krige.cv2' was found
It was found in the following places
  .GlobalEnv
with value

<srcref: file "" chars 6:14 to 22:1>
<bytecode: 0x16efb74e8>